# general libraries used
import sys
import pprint
import json
import re
import os
import shutil
import codecs
import time

# specific libraries used
import numpy as np
import pandas as pd
import tweepy
from tweepy import *
import psycopg2

# Make the SMaSSD support files importable: search the "Classes"
# sub-directory first, then the current working directory.
sys.path.extend(["Classes", "."])

#########################################
# MAIN CODE FOLLOWS
#########################################
from TextTweet import TextTweet
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
import keras.preprocessing.text as kr
from keras.preprocessing.text import Tokenizer
from keras.utils import np_utils
from sklearn.preprocessing import LabelEncoder
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from sklearn.metrics import confusion_matrix
import random

# Fetch the NLTK corpora this script relies on. 'stopwords' is read just
# below; 'wordnet' is presumably needed by TextTweet.lemmatizeWords()
# (TODO confirm against the TextTweet implementation).
for corpus_name in ('wordnet', 'stopwords'):
    nltk.download(corpus_name)

# Language-dependent text-processing resources.
language = 'english'
stemmer = SnowballStemmer(language)
en_stop = set(stopwords.words(language))

tweets = list()

# Run configuration:
#   testing        - True: hold out the rows flagged as testing, evaluate and
#                    write predictions back to the database;
#                    False: train on every selected row and save the model.
#   num_categories - width of the network's output layer.
#   output_file    - filename prefix for the saved model artefacts.
testing = False
num_categories = 2
output_file = "partisan_nn_"

# Accumulators filled while reading the coded tweets from the database.
train_x, train_y = [], []
test_x, test_y = [], []
test_ids = []
done_ids = dict()  # str(tweet id) -> True for every tweet already handled

# Load the hand-coded tweets from the "shoom" database, preprocess their text
# and split them into the train/test accumulators.
# The cursor and connection are released in `finally` blocks so that they are
# closed even when a query or the text preprocessing raises.
conn = psycopg2.connect(database="shoom")
try:
    c = conn.cursor()
    try:
        # str(tweet_id) -> True for every tweet flagged with opp_mentions=1.
        opp_mentions = {}
        c.execute('select tweet_id from congress_tweets where opp_mentions=1')
        for r in c.fetchall():
            opp_mentions[str(r[0])] = True

        # Columns, in order: id, text, original_text, partisan label, testing flag.
        c.execute("select t.id,t.text,t.original_text,d.partisan,d.testing from congress_tweets t, coded_tweets d where t.id=d.id and (testing='t' or training='t')")
        for r in c.fetchall():
            tweet_id, text, original_text, partisan, is_test_row = r
            # Append the original text when one is stored, so its words count
            # towards the features as well.
            if original_text is not None and original_text != '':
                text += ' ' + original_text

            tweet = TextTweet(tweet_id, text)
            tweet.tokenize()
            tweet.removeURLs()
            tweet.removeStopwords(en_stop)
            tweet.removeShortwords(3)
            tweet.lemmatizeWords()
            tweet.multiplyMentions(3)
            processed_text = ' '.join(tweet.words)

            key = str(tweet_id)
            # Synthetic token marking tweets flagged with opp_mentions=1.
            if key in opp_mentions:
                processed_text += ' oppositionmention'

            # Only the first row seen for a given tweet id is used.
            if key not in done_ids:
                if testing and is_test_row:
                    test_x.append(processed_text)
                    test_y.append(partisan)
                    test_ids.append(tweet_id)
                else:
                    # When `testing` is False, rows flagged as testing are
                    # used for training too.
                    train_x.append(processed_text)
                    train_y.append(partisan)
                done_ids[key] = True
    finally:
        c.close()
finally:
    conn.close()

# Build the vocabulary from the training texts only; `dictionary` maps each
# word to its integer index.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_x)
dictionary = tokenizer.word_index

# Turn every training text into the list of indices of its known words.
wordArrays = [
    [dictionary[w] for w in kr.text_to_word_sequence(t) if w in dictionary]
    for t in train_x
]

# The per-text lists have different lengths, so they are passed as a plain
# list of lists: np.asarray() on ragged input raises ValueError on
# NumPy >= 1.24, and sequences_to_matrix() only needs to iterate over them.
# mode='binary' yields one row per text with a 1 in the column of every word
# that occurs in it.
train_x = tokenizer.sequences_to_matrix(wordArrays, mode='binary')

# Encode the labels as integers, then one-hot vectors. `out_classes[i]` is
# the original label corresponding to output column i.
encoder = LabelEncoder()
train_y = encoder.fit_transform(train_y)
train_y = np_utils.to_categorical(train_y)
out_classes = list(encoder.classes_)

# Feed-forward classifier over the binary bag-of-words vectors.
# The input width is len(dictionary)+1 because word indices start at 1 and
# sequences_to_matrix() therefore produces one extra (index 0) column.
model = Sequential()
model.add(Dense(512, input_shape=(len(dictionary)+1,), activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(256, activation='tanh'))
model.add(Dropout(0.5))
# The classes are mutually exclusive and the targets are one-hot encoded, so
# the output layer uses softmax: categorical cross-entropy expects the outputs
# to form a probability distribution over the classes, which independent
# sigmoid units do not provide.
model.add(Dense(num_categories, activation='softmax'))

model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

# fitting time! 20% of the training rows are held out for validation.
model.fit(train_x, train_y,
          batch_size=32,
          epochs=10,
          verbose=1,
          validation_split=0.2,
          shuffle=True)

if testing:
    # Evaluation mode: classify every held-out tweet, print a confusion
    # matrix and store each prediction in coded_tweets.nn.
    predictions = []
    for t in test_x:
        # Words that were not seen during training are ignored.
        wordArray = [dictionary[w]
                     for w in kr.text_to_word_sequence(t)
                     if w in dictionary]
        raw_pred = model.predict(tokenizer.sequences_to_matrix([wordArray], mode='binary'))
        # Map the most probable output column back to its original label.
        predictions.append(out_classes[np.argmax(raw_pred)])
    cm = confusion_matrix(test_y, predictions)
    pprint.pprint(len(predictions))
    pprint.pprint(len(test_y))
    pprint.pprint(cm)

    # Write the predictions back. The connection and cursor are closed in
    # `finally` blocks so a failing UPDATE does not leak them; nothing is
    # committed unless every UPDATE succeeded.
    conn = psycopg2.connect(database="shoom")
    try:
        c = conn.cursor()
        try:
            for tweet_id, p in zip(test_ids, predictions):
                in_p = 't' if p else 'f'
                # Parameterised query: the driver does the quoting instead of
                # splicing values into the SQL text.
                c.execute("UPDATE coded_tweets SET nn=%s WHERE id=%s",
                          (in_p, tweet_id))
            conn.commit()
        finally:
            c.close()
    finally:
        conn.close()
else:
    # Training-only mode: persist the vocabulary, the network architecture
    # and the trained weights.
    # NOTE: output_file already ends with an underscore, so the resulting
    # names contain a double underscore (e.g. "partisan_nn__dict.json").
    # They are kept as-is because other code may load these exact files.
    output_dictionary = output_file + "_dict.json"
    output_structure = output_file + "_structure.json"
    output_weights = output_file + "_weights.h5"
    with open(output_dictionary, 'w', encoding='utf-8') as f:
        json.dump(dictionary, f, ensure_ascii=False)
    with open(output_structure, 'w') as f:
        f.write(model.to_json())
    model.save_weights(output_weights)

